/*******************************************************************************
MANUSCRIPT TITLE: Economic Outcomes Among Microfinance Group Members Receiving Community-based Integrated HIV Care: Cluster Randomized Trial Evidence From Kenya
DESCRIPTION: Code used to analyze 0-18 month GISHE data for the Harambee R01 clinical trial
AUTHOR: Marta Wilson-Barthes
Analysis for: IHEA Special Issue of Social Science & Medicine on Equity in Health Care and Health: The Contribution of Health Economics

DATE STARTED: 1/5/2024
DATE LAST UPDATED: 4/13/24
*******************************************************************************/

/*******************************************************************************
**** Hypothesis: microfinance group members randomized to Arm A (receiving ICB care) will have (1) more savings (shares purchased) and (2) lower loan default rates over 18 months compared to members randomized to Arm B (standard facility care)

*******************************************************************************/
**** Data setup: reset the session, declare project paths, open the log

	* Start from a clean slate: close any open log, then wipe data and macros
	capture log close
	clear all
	macro drop _all

	* Display settings (the "permanently" option keeps -more- off in later sessions too)
	set linesize 255
	set more off, permanently

* Project directories (source and output folders hang off the project folder)

	global folder "P:\Omar Projects\Harambee R01\Aim 3\SSM Manuscript - Analysis of GISHE Performance\"
	global sourcedir "$folder\Source"
	global outputdir "$folder\Output"
	cd "$sourcedir"

* Log (written to the source directory and overwritten on every run)

	log using "$sourcedir\LogGISHE_data.log", replace
	
********************************************************************************

* Load the cleaned Aim 1 Data Set 7 GISHE data
use "$sourcedir\GISHE_data_clean.dta", clear


* Table 1 - Individual GISHE data, initial GISHE visit [MEANS]
* Only month-0 rows are tabulated; the full data set is restored afterwards.
preserve
keep if month == 0
table1_mc, by(arm) ///
	vars( ///
		female_bin contn %4.2f \ ///
		age contn %4.2f \ ///
		mos_income_bin contn %4.2f \ ///
		hfias_severe contn %4.2f \ ///
		secondary_edu cat \ ///
		wealthindex contn %4.2f \ ///
		gishe_months contn %4.2f \ ///
		WalkingDistancekm contn %4.2f \ ///
		group_active_total contn %4.2f \ ///
		meeting_freq cat ///
	) ///
	nospace percent_n onecol missing total(before) test ///
	saving("$sourcedir\table 1amean.xlsx", replace)
restore


* Table 1 - Individual GISHE data, initial GISHE visit [MEDIANS]
* Same month-0 rows; binary variables are declared categorical and the
* remaining variables are declared conts.
preserve
keep if month == 0
table1_mc, by(arm) ///
	vars( ///
		female_bin cat %4.2f \ ///
		age conts %4.2f \ ///
		mos_income_bin cat %4.2f \ ///
		hfias_severe cat %4.2f \ ///
		secondary_edu cat %4.2f \ ///
		wealthindex conts %4.2f \ ///
		gishe_months conts %4.2f \ ///
		WalkingDistancekm conts %4.2f \ ///
		group_active_total conts %4.2f \ ///
		meeting_freq cat ///
	) ///
	nospace percent_n onecol missing total(before) test ///
	saving("$sourcedir\table 1amedian.xlsx", replace)
restore


* Table 1 - Group-level GISHE data initial GISHE visit [Median]
* NOTE(review): the filter below retains the rows whose visit_month is NOT
* "month 0", whereas the individual-level tables keep month == 0 only.
* Confirm this is intended for the group-level meeting variables.
* The table is displayed only (no saving() option is given).
preserve
drop if visit_month == "month 0"
table1_mc, by(arm) ///
	vars( ///
		num_meetings_scheduled conts %4.2f \ ///
		num_meetings conts %4.2f \ ///
		prop_attended_18 conts %4.2f ///
	) ///
	nospace percent_n onecol missing total(before) test
restore

* Number of unique participants in the full data set
distinct participant_id //855


*** Restrict the sample to participants with at least 1 follow-up GISHE visit
set sortseed 123456
capture drop nid
bysort id: generate nid = _N	// number of rows contributed by each id
codebook nid					// 77 observations with only one data point
drop if nid <= 1				// 77 obs deleted
tabulate nid					// 95.6% of participants have data for all 6 follow up GISHE visits over 18 months

* Declare the panel structure (id = panel variable, time = time variable)
xtset id time
xtdescribe						// 778 individuals in data set; 7 time points

*in order to define the covariate list, data need to be complete for each time point
	*fill down the covariates recorded at enrollment
	*rows are explicitly ordered by time within participant so that carryforward
	*always copies from an earlier visit to a later one (a plain -bysort participant_id-
	*leaves the within-participant order arbitrary)
	set sortseed 123456
	bysort participant_id (time): carryforward schooling_achieved, gen(educ)

	replace mos_income=0 if mos_income==5 //make don't know category the lowest for easier coefficient interpretation
	bysort participant_id (time): carryforward mos_income, gen(mincome)

	*Stata treats missing as larger than any number, so ">=20" alone would code
	*missing distances as 1; exclude them so carryforward can fill them instead
	gen distance_bin = 1 if WalkingDistancekm>=20 & !missing(WalkingDistancekm)
	replace distance_bin = 0 if WalkingDistancekm <20

	bysort participant_id (time): carryforward distance_bin, gen(distancebin)

	bysort participant_id (time): carryforward mos_income_bin, gen(income_bin)

	*indicator is 1 when meeting_freq==2 and missing otherwise (no 0 category is assigned)
	gen biweekly_group = 1 if meeting_freq==2
	bysort participant_id (time): carryforward biweekly_group, gen(biweekly)

	*education indicator built from the carried-forward schooling variable
	*(an earlier version carried forward secondary_edu into educ_bin and then
	*immediately dropped it; only this final definition is kept)
	*missing schooling stays missing instead of being coded as 1
	capture drop educ_bin
	capture drop educbin
	gen educ_bin = 1 if educ>=3 & !missing(educ)
	replace educ_bin = 0 if educ <3
	label var educ_bin "At least secondary education completed"
	capture label define educ1 1 "Yes"  0"No"
	label values educ_bin educ1

set scheme s2gcolor

* Convert share purchases from KES to USD (2019 World Bank exchange rate)
generate total_shares_purchased_usd = total_shares_purchased / 101.99

* Conservative assumption: a missing purchase amount means no shares were purchased
replace total_shares_purchased_usd = 0 if total_shares_purchased_usd == .

* Binary indicator of purchasing any shares: 1 if amount > 0, 0 if amount == 0,
* missing otherwise
generate shares_yn = cond(total_shares_purchased_usd > 0 & total_shares_purchased_usd != ., 1, ///
	cond(total_shares_purchased_usd == 0, 0, .))


**** Supplementary Table 1.
* What is the probability of purchasing any amount of shares, by randomization
* assignment? Linear regressions on the binary indicator shares_yn.

* treatment arm only
regress shares_yn tx if time != .		// 66.9% probability of >0 spending in control arm

* treatment-by-time interaction; stored for later reference
regress shares_yn tx##time if time != .
estimates store regress

* average marginal effects of every covariate in the interaction model
margins, dydx(*)						// 2.7 percentage points higher (from a probability of 66.9% in the standard of care arm)



**** TABLE 2.
* Covariates used in the adjusted models
global xlist age i.female_bin i.educ_bin i.income_bin gishe_months WalkingDistancekm no_of_meetings_attended group_active_total meeting_freq


** 18-month pooled effects
* Per-participant sum of USD share purchases across all of that participant's rows
* NOTE(review): the sum is taken over every row, while the models below exclude
* time==1 - confirm the intended summation window.
bysort participant_id: egen total_shares = total(total_shares_purchased_usd)

* Unadjusted two-part model (probit first part, gamma/log GLM second part)
twopm total_shares tx if time!=. & time!=1 , ///
	firstpart(probit) secondpart(glm, family(gamma) link(log)) ///
	re vce(cluster study_group_id)
foreach arm_value in 0 1 {
	margins if tx==`arm_value'
}
margins, dydx(tx)
margins, dydx(*)

* Adjusted two-part model (adds $xlist); estimates exported to CSV
twopm total_shares tx $xlist if time!=. & time!=1 , ///
	firstpart(probit) secondpart(glm, family(gamma) link(log)) ///
	re vce(cluster study_group_id)
estimates store pooled
foreach arm_value in 0 1 {
	margins if tx==`arm_value'
}
margins, dydx(tx)
margins, dydx(*)
esttab pooled using "$sourcedir\tablepooled.csv", replace se ///
	starl( * 0.10 ** 0.05 *** 0.010) varwidth(25) label interaction(" X ") ///
	title(Table 2: Pooled TwoPM Analysis) legend varlabels(_cons constant) ///
	stats(r2 df_r bic, fmt(3 0 1) label(R-sqr dfres BIC))



**Supplementary Table 2A and 2B. treatment-by-time
* Two-part models of per-visit USD share purchases with a treatment-by-time
* interaction. The unadjusted and adjusted fits are stored under separate names
* and exported to separate files so that the adjusted run no longer overwrites
* the unadjusted results.

*unadjusted
twopm total_shares_purchased_usd tx##time if time!=. , firstpart(probit) secondpart(glm, family(gamma) link(log)) re vce(cluster study_group_id)
estimates store utpm
margins, dydx(tx)
margins, dydx(*) //2.4377 0.050
margins, at(time=(2(1)7)) by(tx)
marginsplot, name(tx_time_unadj, replace) //<--FIGURE 1
esttab utpm using "$sourcedir\tables2_unadjusted.csv", replace se starl( * 0.10 ** 0.05 *** 0.010) varwidth(25) label  interaction(" X ") title(Table 2: Pooled Regression Analysis) legend varlabels(_cons constant) stats(r2 df_r bic, fmt(3 0 1) label(R-sqr dfres BIC))

*adjusted
twopm total_shares_purchased_usd tx##time $xlist if time!=. , firstpart(probit) secondpart(glm, family(gamma) link(log)) re vce(cluster study_group_id)
estimates store atpm
margins if tx==0
margins if tx==1
margins, dydx(tx)
margins, dydx(*)
//2.526499, 0.044
margins, at(time=(2(1)7)) by(tx)
* named graph so it is kept alongside the unadjusted plot instead of replacing it
marginsplot, name(tx_time_adj, replace)

* adjusted estimates with standard errors
esttab atpm using "$sourcedir\tables2.csv", replace se starl( * 0.10 ** 0.05 *** 0.010) varwidth(25) label  interaction(" X ") title(Table 2: Pooled Regression Analysis) legend varlabels(_cons constant) stats(r2 df_r bic, fmt(3 0 1) label(R-sqr dfres BIC))

* adjusted estimates with confidence intervals
esttab atpm using "$sourcedir\tables2b.csv", replace ci starl( * 0.10 ** 0.05 *** 0.010) varwidth(25) label  interaction(" X ") title(Table 2: Pooled Regression Analysis) legend varlabels(_cons constant) stats(r2 df_r bic, fmt(3 0 1) label(R-sqr dfres BIC))


		
		
****negative binomial model for outcome: defaulted since last encounter
* covariate list for the default models (adds loan_bal_outstanding)
global xlist age i.female_bin i.educ_bin i.income_bin gishe_months no_of_meetings_attended WalkingDistancekm group_active_total meeting_freq loan_bal_outstanding 

* indicator of any outstanding loan balance (missing balance is coded 0)
capture drop loanbal_yn
gen loanbal_yn = 1 if loan_bal_outstanding >0 & loan_bal_outstanding !=. 
replace loanbal_yn = 0 if loanbal_yn!=1

* Fit once, restricted to follow-up rows with a positive, non-missing balance.
* Missing compares as larger than any number in Stata, so the missing check is
* written out rather than left to listwise deletion on the covariate.
glm defaulted_since_last_encounter i.tx##i.time  $xlist if time!=1 & loan_bal_outstanding >0 & !missing(loan_bal_outstanding), family(nbinomial) link(log) vce(cluster study_group_id) 
* replay the same fit with exponentiated coefficients instead of re-estimating
glm, eform
estimates store A 
estat ic
* first column of the supplementary table: start a fresh file so columns from
* earlier runs are not kept
outreg2 using supTableone.doc, replace addstat(Log-likelihood,`e(ll)', parameters, `e(k)')  ctitle(NBR_all)

	**FIGURE 2. 
	* treatment main effect and treatment-by-time interaction terms, log scale
	local coefinter 1.tx 1.tx#2.time 1.tx#3.time 1.tx#4.time 1.tx#5.time 1.tx#6.time 1.tx#7.time
	coefplot 	(A, label(Tx=1, all) pstyle(p3)), ///
				drop(_cons) keep(`coefinter') xline(0) 
				
	* same terms on the exponentiated scale (reference line at 1)
	local coefinter 1.tx 1.tx#2.time 1.tx#3.time 1.tx#4.time 1.tx#5.time 1.tx#6.time 1.tx#7.time
	coefplot 	(A, label(Tx=1, all) pstyle(p3)), ///
				drop(_cons) keep(`coefinter') xline(1) eform xtitle(Incidence Rate Ratio) 

**Supplementary Table 4. 
* Stratified negative binomial models of loan default; each fit is appended as
* a new column to supTableone.doc.
* NOTE(review): unlike the main model, these fits are not restricted to
* loan_bal_outstanding >0 - confirm that is intended.

*stratify by income	
glm defaulted_since_last_encounter tx##time $xlist if income_bin==1 & time!=1   , family(nbinomial) link(log) vce(cluster study_group_id)  
estat ic
estimates store B
outreg2 using supTableone.doc,  addstat(Log-likelihood,`e(ll)', parameters, `e(k)') append ctitle(income>=50USD per month)

glm defaulted_since_last_encounter tx##time $xlist if income_bin==0 & time!=1  , family(nbinomial) link(log) vce(cluster study_group_id)   
estimates store C 
estat ic
outreg2 using supTableone.doc,  addstat(Log-likelihood,`e(ll)', parameters, `e(k)') append ctitle(income<50USD per month) 

*stratify by distance	
* Missing compares as larger than any number in Stata, so ">20" alone would
* place rows with missing driving distance in the "far" stratum; they are left
* missing here and therefore fall in neither stratum.
capture drop distance_bin
gen distance_bin=1 if DrivingDistancekm>20 & !missing(DrivingDistancekm)
replace distance_bin=0 if DrivingDistancekm<=20

glm defaulted_since_last_encounter tx##time $xlist if distance_bin==1 & time!=1   , family(nbinomial) link(log) vce(cluster study_group_id)  
estat ic
estimates store D
outreg2 using supTableone.doc,  addstat(Log-likelihood,`e(ll)', parameters, `e(k)') append ctitle(NBR_fardistance)

glm defaulted_since_last_encounter tx##time $xlist if distance_bin==0 & time!=1  , family(nbinomial) link(log) vce(cluster study_group_id)   
estimates store E 
estat ic
outreg2 using supTableone.doc,  addstat(Log-likelihood,`e(ll)', parameters, `e(k)') append ctitle(NBR_closedistance) 

**Supplementary Table 3. # of loan defaults at each time point, by arm
* Restricted to follow-up rows with a positive outstanding balance. Missing
* compares as larger than any number in Stata, so rows with a missing balance
* are excluded explicitly; otherwise they would be counted as ">0".
tab defaulted_since_last_encounter time if time!=1 & loan_bal_outstanding >0 & !missing(loan_bal_outstanding), col		
bysort arm: tab defaulted_since_last_encounter time if time!=1 & loan_bal_outstanding >0 & !missing(loan_bal_outstanding), col		
 
***for what purpose did individuals use their loan				
* Load the revised 18-month file from the project output folder (uses the
* outputdir global rather than repeating the absolute path).
* NOTE(review): loanbal_yn must already exist in this saved file; the copy
* generated earlier in this script is discarded by -use, clear-.
use "$outputdir\18MOSGishe_rev.dta", clear

* loan purpose by arm among those with an outstanding balance at time 2
preserve	
keep if time==2 & loanbal_yn==1
table1_mc, 	by(arm) /// 
			vars(		loan_purpose cat	)		nospace percent_n onecol missing total(before) test saving("$sourcedir\table S2a.xlsx",replace)
restore	

* loan purpose by arm among those with an outstanding balance at time 7
preserve	
keep if time==7 & loanbal_yn==1
table1_mc, 	by(arm) /// 
			vars(		loan_purpose cat	)		nospace percent_n onecol missing total(before) test saving("$sourcedir\table S2b.xlsx",replace)
restore	


* release the data and close the log opened at the top of the script
clear
log close	